One month of data has been gathered about car ads. eBay Classifieds is all about bringing buyers and sellers together to make a great deal. Sellers place ads, and buyers look for these ads by browsing or searching on the site. When a buyer finds an ad they find interesting in the result list and clicks it, we call this a 'View Item' page — a VIP view. When a buyer proceeds to contact a seller to get more info or strike a deal, we call this a lead. They can do so by calling (Phone click), asking a question over email (ASQ: Ask Seller a Question), clicking out to a seller's website (URL_CLICK) or placing a bid.
Let's check what we can infer from this data and what type of predictive model we will be able to build.
Metadata:
Let's begin by reading in and examining our data.
# Importing the Data
## Column classes for fread. Note: "date" is not a valid colClasses type for
## data.table::fread, so ad_start_dt is declared as character (which matches
## how it actually ends up — see str() below) and handled downstream.
## Renamed `class` -> `col_classes` to avoid shadowing base::class().
col_classes <- c("numeric", "numeric", "numeric", "character", "character", "numeric", "numeric",
                 "numeric", "numeric", "numeric", "numeric", "numeric", "factor", "character", "numeric",
                 "character", "numeric", "numeric", "character", "numeric", "numeric", "character")
path <- "/Users/mustafaozturk/Desktop/eBay Case/DataSet/cars_dataset_may2019.csv"
## Read in and examine the data
cars <- data.table::fread(path, colClasses = col_classes)
Initial findings:
# telclicks, bids and webclicks contain NA for some ads (example below).
# A missing lead count means "no leads", so recode NA to 0.
cars[src_ad_id==1063865800,]
## src_ad_id telclicks bids kleur carrosserie kmstand days_live photo_cnt
## 1: 1063865800 NA NA Zwart MPV 130773 0 7
## aantaldeuren n_asq bouwjaar emissie energielabel brand l2 ad_start_dt
## 1: 5 0 2005 190 D MAZDA None 11/15/2016
## vermogen webclicks model aantalstoelen price group
## 1: 85 NA 5 7 59500 B
# BUG FIX: the original replaced NA with the string "0", silently coercing
# these numeric columns to character (visible in str() below). Use numeric 0
# so the columns stay numeric.
cars$telclicks[is.na(cars$telclicks)] <- 0
cars$bids[is.na(cars$bids)] <- 0
cars$webclicks[is.na(cars$webclicks)] <- 0
# Some ad_start_dt rows contain "." instead of "/" in the raw Excel file,
# but fread already normalised the separator on import:
table(cars$ad_start_dt)
##
## 11/1/2016 11/10/2016 11/11/2016 11/12/2016 11/13/2016 11/14/2016
## 7138 7054 7685 5809 3298 6302
## 11/15/2016 11/16/2016 11/17/2016 11/18/2016 11/19/2016 11/2/2016
## 6319 6108 7365 7092 5625 5670
## 11/20/2016 11/21/2016 11/22/2016 11/23/2016 11/24/2016 11/25/2016
## 3112 6695 6786 5464 6532 6988
## 11/26/2016 11/27/2016 11/28/2016 11/29/2016 11/3/2016 11/30/2016
## 5785 3205 8460 6111 7103 5540
## 11/4/2016 11/5/2016 11/6/2016 11/7/2016 11/8/2016 11/9/2016
## 7987 5710 2986 6309 6930 5894
# Recode the "None" and "?" placeholder strings to NA
cars[cars == "None"] <- NA
cars[cars == "?"] <- NA
# Summary Statistics — structure of every column after the NA handling above
str(cars)
## Classes 'data.table' and 'data.frame': 183062 obs. of 22 variables:
## $ src_ad_id : num 1.12e+09 1.02e+09 1.09e+09 1.15e+09 1.06e+09 ...
## $ telclicks : chr "1" "0" "0" "3" ...
## $ bids : chr "0" "0" "0" "0" ...
## $ kleur : chr "Zilver of Grijs" "Zilver of Grijs" "Zilver of Grijs" "Zwart" ...
## $ carrosserie : chr "Hatchback (3/5-deurs)" "Hatchback (3/5-deurs)" "Stationwagon" "Sedan (2/4-deurs)" ...
## $ kmstand : num 44958 25072 301409 194481 238101 ...
## $ days_live : num 31 31 31 31 9 8 2 31 31 100 ...
## $ photo_cnt : num 16 22 12 21 8 13 24 14 18 11 ...
## $ aantaldeuren : chr "3" "5" "5" "4" ...
## $ n_asq : num 0 1 0 1 0 9 0 0 0 10 ...
## $ bouwjaar : num 2013 2015 2004 2007 2002 ...
## $ emissie : chr "105" "122" "132" "314" ...
## $ energielabel : Factor w/ 8 levels "?","A","B","C",..: 2 4 3 8 5 4 4 4 4 3 ...
## $ brand : chr "VOLKSWAGEN" "VOLKSWAGEN" "OPEL" "AUDI" ...
## $ l2 : chr NA "157" NA "93" ...
## $ ad_start_dt : chr "11/26/2016" "11/11/2016" "11/2/2016" "11/11/2016" ...
## $ vermogen : num 44 92 59 331 79 74 97 81 80 103 ...
## $ webclicks : chr "2" "4" "3" "13" ...
## $ model : chr "UP" "Golf" "Astra" "S8" ...
## $ aantalstoelen: chr "4" "5" "5" "5" ...
## $ price : num 79500 189500 27500 199000 12500 ...
## $ group : chr "A" "A" "B" "A" ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "index")= int
## ..- attr(*, "__src_ad_id")= int 97692 149657 33271 112977 94027 107736 57116 55810 109457 3443 ...
# Dimensions
dim(cars)
## [1] 183062 22
# Check the A/B test group sizes
table(cars$group)
##
## A B NULL
## 94281 80168 8613
# 8,613 ads carry the literal string "NULL" for group; recode it to NA
cars[cars == "NULL"] <- NA
# Data Exploration
## Missing data: share of NA values per column.
## funs() is deprecated since dplyr 0.8 — use a formula lambda instead.
MissingValues <- cars %>% summarise_all(~ sum(is.na(.)) / n())
MissingValues <- gather(MissingValues, key = "feature", value = "MissingPct")
MissingValues %>%
  ggplot(aes(x = reorder(feature, -MissingPct), y = MissingPct)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() + theme_bw()
# A very small percentage of data is NA except l2, energielabel and carrosserie
# May need KNN/RF imputation for l2, energielabel and carrosserie
# Creating new variables
## Car age from the build year (bouwjaar).
## NOTE(review): this uses the year at analysis time (Sys.Date()), not
## ad_start_dt + days_live, so the values shift when the script is re-run.
## Per-year features boost the predictive signal of emissions / mileage.
cars <- cars %>% mutate(age = as.numeric(format(Sys.Date(), "%Y")) -
                          as.integer(bouwjaar),
                        annual_emissions = as.numeric(emissie) / age,
                        annual_kms = kmstand / age)
# Age buckets. cut() replaces the nested ifelse chain, which carried an
# inconsistent boundary (`5 < age` where `6 < age` was intended — unreachable,
# but misleading) and produced alphabetically ordered factor levels.
cars$ageGroup <- cut(cars$age,
                     breaks = c(-Inf, 3, 6, 10, 15, 20, Inf),
                     labels = c("(<=3)", "(4-6)", "(7-10)", "(11-15)", "(16-20)", "(20+)"))
# Visual Exploration
## Distribution of the new age feature — density peaks around the 5-year mark
cars %>%
  ggplot(aes(x = age)) + geom_line(stat = "density", color = "red", size = 1.2) + theme_bw()
# Histogram view by year. Reference the column inside aes() rather than
# cars$age — mixing `data =` with `$`-extraction breaks faceting/grouping.
ggplot(cars, aes(x = age)) +
  geom_histogram(color = 'white', fill = 'lightblue') +
  scale_x_continuous(limit = c(0, 35), breaks = seq(0, 35, 2)) +
  labs(x = 'Car Age', y = 'Number of Cars', title = 'Car Age Histogram')
# Segregate by car type: every vehicle type spans a broad range of ages
ggplot(cars, aes(x = carrosserie, y = age)) +
  geom_boxplot(fill = "lightblue", color = 'red') +
  geom_boxplot(aes(fill = carrosserie)) +
  stat_summary(fun = mean, geom = "point", size = 2) +  # fun.y deprecated since ggplot2 3.3
  labs(x = 'Vehicle Type', y = 'Age') +
  ggtitle('Age vs. Vehicle Type')
# Examine car types
## Hatchbacks are heavily over-represented in the dataset
ggplot(cars, aes(x = carrosserie, fill = carrosserie)) +
  geom_bar() +
  labs(x = 'Vehicle Type', y = 'Number of Cars') +
  ggtitle('Vehicle Type Frequency Diagram')
# How long before a car is sold?
## Most ads stay live for the full 30+ days
ggplot(cars, aes(x = days_live)) +
  geom_histogram(breaks = seq(0, 35, by = 5),
                 col = "red",
                 fill = "green",
                 alpha = .2) +
  labs(title = "Histogram for Days Live") +
  labs(x = "Days Live", y = "Count")
# Lead columns back to numeric (no-ops if they are already numeric)
cars$telclicks <- as.numeric(cars$telclicks)
cars$bids <- as.numeric(cars$bids)
cars$webclicks <- as.numeric(cars$webclicks)
# Total leads per ad across all four contact channels
cars <- cars %>% mutate(TotalClicks = telclicks + webclicks + n_asq + bids)
# Binary response label: did the ad receive any lead at all?
cars$clicked <- as.factor(ifelse(cars$TotalClicks > 0, 1, 0))
# Response variable: total clicks extrapolated to a 10-day window.
## Ads live < 3 days are excluded (NA); 3+ day ads are scaled linearly.
## BUG FIX: the original referenced cars$total_clicks, which does not exist
## (so every value came out NA — see str(features) below); the column is
## named TotalClicks.
cars$TenDaysClick <- ifelse(cars$days_live < 3, NA,
                            (10 * cars$TotalClicks) / cars$days_live)
# Examine the response distribution
## As expected, most ads receive very few clicks
ggplot(cars, aes(x = TotalClicks)) +
  geom_histogram(breaks = seq(0, 35, by = 5),
                 col = "red",
                 fill = "green",
                 alpha = .2) +
  labs(title = "Histogram for Total Clicks") +
  labs(x = "Total Clicks", y = "Count")
# With the label defined, examine the A/B test results.
## Build subset tables holding only the experiment groups for later analysis.
carsAB <- cars %>% filter(group %in% c("A", "B"))
carsA <- carsAB %>% filter(group == "A")
carsB <- carsAB %>% filter(group == "B")
### summary(carsA)
### summary(carsB)
## Comparing the group summaries, the only clear difference we found is that
## Group A has a higher mean price than Group B. One-sided test of
## H1: mean(A) - mean(B) < 0; the large p-value (0.99) is consistent with
## A's mean actually being HIGHER, not lower.
t.test(price ~ group, data = carsAB, alternative = "less")
##
## Welch Two Sample t-test
##
## data: price by group
## t = 2.4035, df = 169663, p-value = 0.9919
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 18349.16
## sample estimates:
## mean in group A mean in group B
## 106457.10 95563.26
# Hypothesis: price affects the click rate on ads. One-sided test of
# H1: mean(A) - mean(B) > 0; p close to 1 means group A does NOT get more
# clicks — the sample means below show A receives fewer clicks than B.
t.test(TotalClicks ~ group, data = carsAB, alternative= "greater")
##
## Welch Two Sample t-test
##
## data: TotalClicks by group
## t = -8.079, df = 167608, p-value = 1
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -0.432395 Inf
## sample estimates:
## mean in group A mean in group B
## 5.150539 5.509792
# The groups look separable by price — visualize clicks vs. price per group
ggplotscaled <- ggplot(na.omit(carsAB), aes(x = scale(TotalClicks), y = scale(price), color = group)) +
  geom_point() +
  labs(title = "Clicks by Price in A/B Test (Scaled)") +
  labs(color = "Test Groups") +
  labs(x = "Total Clicks", y = "Price")
ggplotscaled  # BUG FIX: the assignment alone never rendered the plot
Figure: Clicks by Price in the A/B test (scaled).
# Confidence intervals for mean TotalClicks in each group.
# NOTE(review): qt(0.90, df) gives an 80% two-sided CI (90% one-sided) —
# confirm that is the intended confidence level.
## Group A — use an NA-consistent n so it matches the na.rm'd mean/sd
nA <- sum(!is.na(carsA$TotalClicks))
errorTCA <- qt(0.90, df = nA - 1) * sd(carsA$TotalClicks, na.rm = TRUE) / sqrt(nA)
leftTCA <- mean(carsA$TotalClicks, na.rm = TRUE) - errorTCA
rightTCA <- mean(carsA$TotalClicks, na.rm = TRUE) + errorTCA
leftTCA; rightTCA
## [1] 5.112747
## [1] 5.188331
## Group B
nB <- sum(!is.na(carsB$TotalClicks))
errorTCB <- qt(0.90, df = nB - 1) * sd(carsB$TotalClicks, na.rm = TRUE) / sqrt(nB)
leftTCB <- mean(carsB$TotalClicks, na.rm = TRUE) - errorTCB
rightTCB <- mean(carsB$TotalClicks, na.rm = TRUE) + errorTCB
leftTCB; rightTCB
## [1] 5.467138
## [1] 5.552446
# Relative click-rate difference between the groups
clicksA <- (leftTCA + rightTCA) / 2  # CI midpoint equals the sample mean
clicksB <- (leftTCB + rightTCB) / 2
clicksTot <- clicksA + clicksB
conversion <- clicksA / clicksTot - clicksB / clicksTot
rate <- round(conversion * 100, 2)
# Group A received ~3.37% fewer clicks on average
rate
## [1] -3.37
# Examine three charts together, per vehicle type across days live:
## standard deviation of clicks, median clicks, and ad counts.
### Hatchback observations dominate the early days and taper through the
### month; this abundance will skew any inference from the A/B test.
# Pipe the data in rather than passing it as a trailing named `.data`
# argument, which is fragile/non-idiomatic dplyr.
DaysLiveGroup <- carsAB %>% group_by(days_live, carrosserie)
DaysClicks <- summarise(DaysLiveGroup,
                        sdClicks = sd(TotalClicks, na.rm = TRUE),
                        medianClicks = median(TotalClicks, na.rm = TRUE),
                        count = n())
p1 <- ggplot(DaysClicks) +
  geom_smooth(aes(x = days_live, y = sdClicks, color = carrosserie), se = FALSE) +
  xlim(0, 30) +
  labs(color = "Vehicles") +
  labs(x = "Days Live", y = "Deviation of Clicks")
p2 <- ggplot(DaysClicks) +
  geom_smooth(aes(x = days_live, y = medianClicks, color = carrosserie), se = FALSE) +
  xlim(0, 30) +
  labs(color = "Vehicles") +
  labs(x = "Days Live", y = "Median of Clicks")
p3 <- ggplot(DaysClicks) +
  geom_smooth(aes(x = days_live, y = count, color = carrosserie), se = FALSE) +
  xlim(0, 30) +
  labs(color = "Vehicles") +
  labs(x = "Days Live", y = "Count of Clicks")
grid.arrange(p1, p2, p3, ncol = 1)
# Interactive version of the lifecycle count chart so we can play with
# the data.
### Hatchbacks are highly popular within the test groups.
CarsCount <- ggplot(DaysClicks) +
  geom_smooth(aes(x = days_live, y = count, color = carrosserie), se = FALSE) +
  xlim(0, 150) +
  labs(title = "Clicks per Vehicle by Days Live",
       color = "Vehicles",
       x = "Days Live",
       y = "Number of Clicks")
ggplotly(CarsCount)
# Examine some summary statistics of the Hatchback segment:
## it is below the mean/median price of the group,
## it has less power (vermogen) than the average car in the group,
## and it has fewer kilometers on average even though its age matches the group
carsAB %>% filter(carrosserie == "Hatchback (3/5-deurs)") %>%
select(photo_cnt, vermogen, price, age, kmstand, group) %>%
summary()
## photo_cnt vermogen price age
## Min. : 0.00 Min. : 0.00 Min. : 0 Min. : 3.00
## 1st Qu.:10.00 1st Qu.: 50.00 1st Qu.: 17500 1st Qu.: 7.00
## Median :14.00 Median : 60.00 Median : 49500 Median :11.00
## Mean :14.98 Mean : 66.49 Mean : 73295 Mean :11.65
## 3rd Qu.:21.00 3rd Qu.: 77.00 3rd Qu.: 99000 3rd Qu.:16.00
## Max. :24.00 Max. :478.00 Max. :99999990 Max. :28.00
## NA's :3
## kmstand group
## Min. : 0 Length:76156
## 1st Qu.: 59494 Class :character
## Median : 113978 Mode :character
## Mean : 121327
## 3rd Qu.: 172052
## Max. :1455145
## NA's :512
# Same summary for everything that is NOT a Hatchback, for comparison
carsAB %>% filter(carrosserie != "Hatchback (3/5-deurs)") %>%
select(photo_cnt, vermogen, price, age, kmstand, group) %>%
summary()
## photo_cnt vermogen price age
## Min. : 0.00 Min. : 0.0 Min. : 0 Min. : 3.00
## 1st Qu.:11.00 1st Qu.: 77.0 1st Qu.: 25000 1st Qu.: 8.00
## Median :17.00 Median : 92.0 Median : 69500 Median :12.00
## Mean :16.54 Mean :103.9 Mean : 116992 Mean :12.08
## 3rd Qu.:24.00 3rd Qu.:118.0 3rd Qu.: 152000 3rd Qu.:16.00
## Max. :24.00 Max. :541.0 Max. :99999990 Max. :28.00
## NA's :2
## kmstand group
## Min. : 0 Length:81329
## 1st Qu.: 93037 Class :character
## Median : 154642 Mode :character
## Mean : 158874
## 3rd Qu.: 215856
## Max. :1120000
## NA's :479
# New table with more evenly distributed car data (Hatchbacks removed)
carsABRecent <- carsAB %>% filter(carrosserie != "Hatchback (3/5-deurs)")
ttRecent <- t.test(TotalClicks ~ group, data = carsABRecent)
ttRecent
##
## Welch Two Sample t-test
##
## data: TotalClicks by group
## t = -6.7353, df = 78273, p-value = 1.647e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.5129820 -0.2817207
## sample estimates:
## mean in group A mean in group B
## 4.872662 5.270013
# Click change between groups. Take the group means from the t-test object
# instead of hard-coding values copied from the printout above.
clicksAB <- unname(ttRecent$estimate["mean in group A"])
clicksBA <- unname(ttRecent$estimate["mean in group B"])
clicksTots <- clicksAB + clicksBA
conversion_smooth <- clicksAB / clicksTots - clicksBA / clicksTots
rate_smooth <- round(conversion_smooth * 100, 2)
# Group A received ~3.92% fewer clicks on average.
# Customers in these samples are more likely to click on cheap cars,
# even with the cheap, highly popular Hatchback removed.
rate_smooth
## [1] -3.92
# Having concluded the A/B analysis, return to the main data.
# The "?" / "None" placeholders were already recoded to NA earlier, so the
# original `== "."` checks matched nothing and the missing values stayed NA.
# BUG FIX: test is.na() instead, so missing values become an explicit
# "Other" level as intended.
cars$kleur <- as.factor(ifelse(is.na(cars$kleur),
                               "Other", cars$kleur))
cars$carrosserie <- as.factor(ifelse(is.na(cars$carrosserie),
                                     "Other", cars$carrosserie))
cars$aantaldeuren <- as.factor(ifelse(is.na(cars$aantaldeuren),
                                      "Other", cars$aantaldeuren))
# energielabel is already a factor; ifelse() would strip it to integer level
# codes (that is why str() showed levels "2".."8"), so convert to character
# first to keep the real labels.
cars$energielabel <- as.factor(ifelse(is.na(cars$energielabel),
                                      "Other", as.character(cars$energielabel)))
cars$aantalstoelen <- as.factor(ifelse(is.na(cars$aantalstoelen),
                                       "Other", cars$aantalstoelen))
cars$photo_cnt <- as.factor(cars$photo_cnt)
cars$emissie <- as.numeric(cars$emissie)
# Drop unrealistic prices (a €0 car, 100 million for a Volvo, etc.):
# keep only the 5th–98th percentile, NA out the rest.
cars$price <- ifelse(cars$price < quantile(cars$price, 0.05, na.rm = TRUE), NA,
                     ifelse(cars$price > quantile(cars$price, 0.98, na.rm = TRUE), NA, cars$price))
# "Model" alone has no predictive power, but combined with brand it may.
# Build a cleaned, lower-case brand+model label; "model" itself can then be
# dropped as noise.
# BUG FIX: the original called as.factor() before str_replace_all(), which
# silently coerced the factor back to character; build the string fully,
# then convert to factor exactly once.
cars$brand <- str_replace_all(cars$brand, pattern = "[[:punct:]]", "")
cars$brand <- str_replace_all(cars$brand, pattern = "\\s+", " ")
label_txt <- paste(cars$brand, cars$model, sep = " ")
label_txt <- str_replace_all(label_txt, pattern = "[[:punct:]]", "")
label_txt <- str_replace_all(label_txt, pattern = "\\s+", " ")
cars$label <- as.factor(tolower(label_txt))
# Split each label into its words to see what's popular in the ads
AllLabels <- str_split(cars$label, " ")
# Words per label; lengths() is the vectorised, type-stable alternative
# to sapply(..., length)
WordsPerLabel <- lengths(AllLabels)
# table of frequencies
table(WordsPerLabel)
## WordsPerLabel
## 2 3 4 5 6 7 8 9 10 11
## 163745 17985 848 370 29 21 11 11 6 8
## 12 13 14 15 16 18 19
## 4 10 2 5 4 2 1
# as a percent
100 * round(table(WordsPerLabel)/length(WordsPerLabel), 4)
## WordsPerLabel
## 2 3 4 5 6 7 8 9 10 11 12 13
## 89.45 9.82 0.46 0.20 0.02 0.01 0.01 0.01 0.00 0.00 0.00 0.01
## 14 15 16 18 19
## 0.00 0.00 0.00 0.00 0.00
# vector of all words across labels
TitleWords <- unlist(AllLabels)
# Count occurrences of each unique word with table(). The original looped
# over every unique word and rescanned the full word vector each time —
# an accidental O(n^2); table() does the same count in one pass.
WordFreqs <- table(TitleWords)
UniqueWords <- names(WordFreqs)
NumUniqueWords <- length(UniqueWords)
CountWords <- as.vector(WordFreqs)
# index values in decreasing order of frequency
Top30Order <- order(CountWords, decreasing = TRUE)[1:30]
# top 30 frequencies
Top30Freqs <- CountWords[Top30Order]
# select the top 30 words
Top30Words <- UniqueWords[Top30Order]
# barplot
## Volkswagen is far ahead of the others
barplot(Top30Freqs, border = NA, names.arg = Top30Words,
        las = 2, ylim = c(0, 25000))
# Which vehicle type pairs with the most frequent brands?
## Consistent with the A/B results: the most popular brand/type
## combinations are all Hatchbacks
cars %>%
group_by(brand, carrosserie) %>%
mutate(count = n()) %>%
select(brand, carrosserie, count) %>%
arrange(desc(count)) %>%
unique() %>%
head()
## # A tibble: 6 x 3
## # Groups: brand, carrosserie [6]
## brand carrosserie count
## <chr> <fct> <int>
## 1 VOLKSWAGEN Hatchback (3/5-deurs) 11075
## 2 PEUGEOT Hatchback (3/5-deurs) 8485
## 3 OPEL Hatchback (3/5-deurs) 6815
## 4 FORD Hatchback (3/5-deurs) 6196
## 5 RENAULT Hatchback (3/5-deurs) 5908
## 6 FIAT Hatchback (3/5-deurs) 5194
# Other feature-creation ideas
## We can bin certain variables if they prove worthwhile:
## vermogen could be split into high/low power,
## emissie into high/low emission cars
cars %>% select(age, price, kmstand, vermogen, days_live, TotalClicks, emissie) %>%
cor(use = "complete.obs") %>% corrplot::corrplot()
# Features to drop:
# model has too many levels with no value; the date was consumed by age;
# l2 is undocumented and mostly noise
table(cars$l2)
##
## 100 101 103 105 108 110 111 112 113 114 115 117 118 119 122
## 245 1948 519 446 81 7 2398 3274 26 513 992 180 118 882 2
## 123 124 125 127 128 129 130 131 132 133 134 135 138 139 140
## 100 267 100 2 15 781 2705 2 47 545 607 986 4719 45 3995
## 143 144 146 147 148 150 151 152 153 154 155 157 158 159 2148
## 6 186 3927 104 280 1908 661 266 162 1242 1870 8072 2038 91 11
## 2149 2150 2151 2152 2153 2155 2156 2659 2660 2829 2830 2831 92 93 95
## 3 2 26 2 2 1 1 5 93 4 9 1 634 2872 5273
## 96 97 98 99
## 3687 4 25 470
# Select final features; drop unused columns and ones that would leak the
# label (the raw lead counts that TenDaysClick is derived from)
features <- cars %>%
select(- `group`, - model, - ad_start_dt,
- src_ad_id, - bouwjaar, - l2, - webclicks, - telclicks, - TotalClicks,
- n_asq, - bids)
str(features)
## 'data.frame': 183062 obs. of 19 variables:
## $ kleur : Factor w/ 9 levels "Beige","Blauw",..: 8 8 8 9 8 9 6 4 4 8 ...
## $ carrosserie : Factor w/ 8 levels "Cabriolet","Coup",..: 3 3 7 6 4 7 1 8 3 NA ...
## $ kmstand : num 44958 25072 301409 194481 238101 ...
## $ days_live : num 31 31 31 31 9 8 2 31 31 100 ...
## $ photo_cnt : Factor w/ 25 levels "0","1","2","3",..: 17 23 13 22 9 14 25 15 19 12 ...
## $ aantaldeuren : Factor w/ 9 levels "0","1","2","3",..: 4 6 6 5 5 6 3 6 6 6 ...
## $ emissie : num 105 122 132 314 178 140 145 128 0 156 ...
## $ energielabel : Factor w/ 7 levels "2","3","4","5",..: 1 3 2 7 4 3 3 3 3 2 ...
## $ brand : chr "VOLKSWAGEN" "VOLKSWAGEN" "OPEL" "AUDI" ...
## $ vermogen : num 44 92 59 331 79 74 97 81 80 103 ...
## $ aantalstoelen : Factor w/ 33 levels "1","10","116 km NAP",..: 14 16 16 16 16 14 14 16 16 16 ...
## $ price : num 79500 189500 27500 199000 12500 ...
## $ age : num 6 4 15 12 17 13 4 3 18 12 ...
## $ annual_emissions: num 17.5 30.5 8.8 26.2 10.5 ...
## $ annual_kms : num 7493 6268 20094 16207 14006 ...
## $ ageGroup : Factor w/ 6 levels "(<=3)","(11-15)",..: 5 5 2 2 3 2 5 1 3 2 ...
## $ clicked : Factor w/ 2 levels "0","1": 2 2 2 2 1 2 2 1 1 2 ...
## $ TenDaysClick : num NA NA NA NA NA NA NA NA NA NA ...
## $ label : Factor w/ 2332 levels "ackermannfruehauf na",..: 2283 2216 1590 134 1878 1979 1855 1990 1677 750 ...
## Machine-learning setup
# H2O is used for performance gains; the chosen algorithms handle NAs
# natively (RF imputation made no difference) and scale robustly
# (YeoJohnson made no difference).
carsh2o <- as.h2o(features)
# Split into training / validation / testing sets (70 / 15 / 15)
splits <- h2o.splitFrame(data = carsh2o, ratios = c(0.7, 0.15), seed = 1)
train <- splits[[1]]
validate <- splits[[2]]
test <- splits[[3]]
# Define the label and predictors
response <- "TenDaysClick"
predictors <- setdiff(names(train), response)
# BUG FIX: TenDaysClick is a continuous 10-day click rate, and every model
# below is evaluated with RMSE / a gaussian family, so the response must
# stay numeric. The original converted it to a factor "since we want to know
# if it's a Click (1) or Not (0)" — that description fits the separate
# `clicked` column, not this response — so those as.factor() conversions on
# train/validate/test are removed.
# GBM with light manual tuning; early stopping on validation RMSE.
# Categorical predictors are one-hot encoded, which usually improves RMSE.
gbmFit <- h2o.gbm(x = predictors,
y = response,
training_frame = train,
model_id = "gbmFit",
validation_frame = validate,
ntrees = 500,
score_tree_interval = 5,
stopping_rounds = 3,
stopping_metric = "RMSE",
stopping_tolerance = 0.0005,
categorical_encoding = "OneHotExplicit",
seed = 1)
# Hold-out (test) and in-sample (train) performance, to gauge overfit
gbmPerf <- h2o.performance(model = gbmFit,
newdata = test)
gbmPerftr <- h2o.performance(model = gbmFit,
newdata = train)
print(gbmPerf)
print(gbmPerftr)
# Distributed Random Forest with 5-fold cross-validation.
# NOTE(review): unlike the GBM, no validation_frame is supplied here —
# nfolds provides the validation signal instead.
rfFit <- h2o.randomForest(x = predictors,
y = response,
training_frame = train,
model_id = "rfFit",
seed = 1,
nfolds = 5)
# Hold-out (test) and in-sample (train) performance, to gauge overfit
rfPerf <- h2o.performance(model = rfFit,
newdata = test)
rfPerftr <- h2o.performance(model = rfFit,
newdata = train)
print(rfPerf)
print(rfPerftr)
# Generalized Linear Model with gaussian family (i.e. regularised linear
# regression); lambda_search tunes the regularisation strength automatically.
glmFit <- h2o.glm( x = predictors,
y = response,
training_frame = train,
model_id = "glmFit",
validation_frame = validate,
family = "gaussian",
lambda_search = TRUE)
# Hold-out (test) and in-sample (train) performance, to gauge overfit
glmPerf <- h2o.performance(model = glmFit,
newdata = test)
glmPerftr <- h2o.performance(model = glmFit,
newdata = train)
print(glmPerf)
print(glmPerftr)
# Compare the models on RMSE.
# RF and GBM performed very closely; GBM had the best test RMSE but was
# also the slowest.
# BUG FIX: the original referenced rf_perf / gbm_perf / glm_perf (and the
# *_train variants), which were never defined — the performance objects
# above are named rfPerf, gbmPerf, glmPerf and rfPerftr, gbmPerftr, glmPerftr.
rfper <- h2o.rmse(rfPerf)       # 7.471413
gbmper <- h2o.rmse(gbmPerf)     # 7.70153
glmper <- h2o.rmse(glmPerf)     # 10.11961
rfpertr <- h2o.rmse(rfPerftr)   # 3.467813
gbmpertr <- h2o.rmse(gbmPerftr) # 7.604088
glmpertr <- h2o.rmse(glmPerftr) # 10.68962
# Assemble the Test/Train RMSE table directly; this also avoids the
# original's `gbm`/`glm` intermediates, which shadowed stats::glm and the
# `gbm` model object created later.
final <- rbind(RF  = c(rfper, rfpertr),
               GBM = c(gbmper, gbmpertr),
               GLM = c(glmper, glmpertr))
colnames(final) <- c("Test", "Train")
print(final)
The results (RMSE) are below:
| Test | Train | |
|---|---|---|
| RF | 7.471413 | 3.467813 |
| GBM | 7.701530 | 7.604088 |
| GLM | 10.119612 | 10.689616 |
RF generally produces good results, but we must be careful before selecting it: the large gap between its train and test RMSE suggests overfitting. I will therefore select GBM and tune its hyperparameters.
# Variable importance
# For GBM, days_live dominates and the engineered age feature ranks 4th;
# for RF, age ranks 5th (see the tables below).
# BUG FIX: the fitted models are named gbmFit / rfFit, not gbm_fit / rf_fit.
print(gbmFit@model$variable_importances)
print(rfFit@model$variable_importances)
Variable Importances for GBM:
| variable relative_importance scaled_importance percentage | ||||
|---|---|---|---|---|
| 1 | days_live | 16982456.000000 | 1.000000 | 0.501079 |
| 2 | price | 5190817.000000 | 0.305658 | 0.153159 |
| 3 | clicked.0 | 4012160.500000 | 0.236253 | 0.118382 |
| 4 | age | 1407525.750000 | 0.082881 | 0.041530 |
| 5 | label.volkswagen polo | 725675.562500 | 0.042731 | 0.021412 |
| variable | relative_importance | scaled_importance | percentage | |
|---|---|---|---|---|
| 2446 | label.westfield cabriolet | 0.000000 | 0.000000 | 0.000000 |
| 2447 | label.westfield se k6 | 0.000000 | 0.000000 | 0.000000 |
| 2448 | label.westfield westfield | 0.000000 | 0.000000 | 0.000000 |
| 2449 | label.xxtrail na | 0.000000 | 0.000000 | 0.000000 |
| 2450 | label.zundapp fabia | 0.000000 | 0.000000 | 0.000000 |
| 2451 | label.missing(NA) | 0.000000 | 0.000000 | 0.000000 |
Variable Importances for RF:
| variable | relative_importance | scaled_importance | percentage | |
|---|---|---|---|---|
| 1 | days_live | 102245984.000000 | 1.000000 | 0.239505 |
| 2 | photo_cnt | 48573184.000000 | 0.475062 | 0.113780 |
| 3 | price | 43279472.000000 | 0.423288 | 0.101379 |
| 4 | label | 29833160.000000 | 0.291778 | 0.069882 |
| 5 | age | 23406868.000000 | 0.228927 | 0.054829 |
| 6 | kleur | 19365180.000000 | 0.189398 | 0.045362 |
| 7 | kmstand | 19202722.000000 | 0.187809 | 0.044981 |
| 8 | clicked | 19151188.000000 | 0.187305 | 0.044860 |
| 9 | energielabel | 17910164.000000 | 0.175167 | 0.041953 |
| 10 | annual_emissions | 17643446.000000 | 0.172559 | 0.041329 |
| 11 | annual_kms | 16709458.000000 | 0.163424 | 0.039141 |
| 12 | vermogen | 14400799.000000 | 0.140845 | 0.033733 |
| 13 | aantaldeuren | 13734035.000000 | 0.134323 | 0.032171 |
| 14 | emissie | 13316308.000000 | 0.130238 | 0.031193 |
| 15 | carrosserie | 13147768.000000 | 0.128590 | 0.030798 |
| 16 | ageGroup | 8611877.000000 | 0.084227 | 0.020173 |
| 17 | aantalstoelen | 6373984.500000 | 0.062340 | 0.014931 |
# GBM was the best performer, so tune it.
# Hyper-parameter tuning — first pass: sweep max_depth only.
hyper_params <- list(max_depth = seq(1, 29, 2))  # dataset is small
grid <- h2o.grid(
  hyper_params = hyper_params,
  ## full Cartesian hyper-parameter search
  search_criteria = list(strategy = "Cartesian"),
  algorithm = "gbm",
  grid_id = "depth_grid",
  x = predictors,
  y = response,
  training_frame = train,
  validation_frame = validate,
  ## "more than enough" trees — early stopping prunes them
  ntrees = 10000,
  ## with learning-rate annealing we can afford a larger starting rate
  learn_rate = 0.05,
  ## learning rate shrinks by 1% after every tree
  learn_rate_annealing = 0.99,
  sample_rate = 0.8,
  col_sample_rate = 0.8,
  seed = 1234,
  ## stop once validation RMSE stops improving for 5 consecutive scoring events
  stopping_rounds = 5,
  stopping_tolerance = 1e-4,
  stopping_metric = "RMSE",
  ## score every 10 trees to make early stopping reproducible
  score_tree_interval = 10
)
## Sort the grid models by RMSE, ascending — lower RMSE is better.
## BUG FIX: the original used decreasing = TRUE (copied from an AUC example,
## where higher is better), which put the WORST models first and made the
## "top 5" depth range below come from the worst configurations.
sortedGrid <- h2o.getGrid("depth_grid", sort_by = "rmse", decreasing = FALSE)
## Range of max_depth among the top 5 models
topDepths <- sortedGrid@summary_table$max_depth[1:5]
minDepth <- min(as.numeric(topDepths))
maxDepth <- max(as.numeric(topDepths))
# With a good max_depth range established, tune the remaining parameters
# in more detail. Since we don't know which hyper-parameter combinations
# will produce the best model, use random search.
hyper_params <- list(
  ## restrict the search to the max_depth range established above
  max_depth = seq(minDepth, maxDepth, 1),
  sample_rate = seq(0.2, 1, 0.01),
  col_sample_rate = seq(0.2, 1, 0.01),
  col_sample_rate_per_tree = seq(0.2, 1, 0.01),
  col_sample_rate_change_per_level = seq(0.9, 1.1, 0.01),
  min_rows = 2^seq(0, log2(nrow(train)) - 1, 1),
  nbins = 2^seq(4, 10, 1),
  nbins_cats = 2^seq(4, 12, 1),
  min_split_improvement = c(0, 1e-8, 1e-6, 1e-4),
  histogram_type = c("UniformAdaptive", "QuantilesGlobal", "RoundRobin")
)
search_criteria <- list(
  ## random grid search
  strategy = "RandomDiscrete",
  ## limit the runtime to 60 minutes
  max_runtime_secs = 3600,
  ## and build no more than 100 models
  max_models = 100,
  seed = 1234,
  ## stop once the top-5 leaderboard converges to 0.1% relative difference
  stopping_rounds = 5,
  stopping_metric = "RMSE",
  stopping_tolerance = 1e-3
)
# Random search over the full hyper-parameter space defined above
grid <- h2o.grid(
  hyper_params = hyper_params,
  search_criteria = search_criteria,
  algorithm = "gbm",
  grid_id = "final_grid",
  x = predictors,
  y = response,
  training_frame = train,
  validation_frame = validate,
  ntrees = 10000,
  learn_rate = 0.05,
  learn_rate_annealing = 0.99,
  max_runtime_secs = 3600,
  stopping_rounds = 5, stopping_tolerance = 1e-4, stopping_metric = "RMSE",
  score_tree_interval = 10,
  nfolds = 5,
  seed = 1234
)
## Sort the grid models by RMSE, ascending — lower is better.
## BUG FIX: with decreasing = TRUE (as in the original), model_ids[[1]]
## below would select the WORST configuration, not the best.
sortedGrid <- h2o.getGrid("final_grid", sort_by = "rmse", decreasing = FALSE)
print(sortedGrid)
# Choose the best model (first entry after the ascending sort)
gbm <- h2o.getModel(sortedGrid@model_ids[[1]])
print(h2o.rmse(h2o.performance(gbm, newdata = test)))
The final test RMSE is 6.924783.
# Keeping the same "best" model,
# make test-set predictions as follows:
preds <- h2o.predict(gbm, test)
head(preds, 10)
summary(preds)
# Final GBM metric: R^2 on the validation frame
print(gbm@model$validation_metrics@metrics$r2)
The validation R² is 0.500924.
# Save the model and the predictions.
# NOTE(review): h2o.saveModel's second argument is a directory, and the model
# is written in H2O's binary format — the ".csv" name is misleading; consider
# renaming the target path.
h2o.saveModel(gbm, "/Users/mustafaozturk/Desktop/eBay Case/best_model.csv", force=TRUE)
h2o.exportFile(preds, "/Users/mustafaozturk/Desktop/eBay Case/best_preds.csv", force=TRUE)
```